package com.yao2san.dedup.crawler;

import com.google.common.collect.Lists;
import com.yao2san.dedup.input.Input;
import com.yao2san.dedup.output.Output;
import com.yao2san.dedup.output.Result;
import com.yao2san.dedup.output.ResultSentence;
import com.yao2san.dedup.output.TargetSentence;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Default {@link DeDup} implementation.
 *
 * <p>The input text is split into sentences, the input key is searched on Baidu with a
 * headless Chrome driver, every search hit is opened, and each sentence of the visited page
 * is compared against each input sentence via {@code Similarity.getSimilarity}. Sentences
 * whose similarity exceeds {@code input.getMaxRepetitionRate()} are collected into the
 * {@link Output}.
 *
 * <p>An instance is single-use: the web driver is shut down at the end of
 * {@link #getOutput()}. The driver and the result map are accessed without any
 * synchronization, so an instance must not be shared between threads.
 */
public class DeDupDefaultImpl implements
        DeDup {
    /** Base URL of the Baidu search endpoint; query parameters are appended to it. */
    private static final String SEARCH_URL = "https://www.baidu.com/s?";
    /** Regex of the ASCII and full-width sentence terminators used to split text. */
    private static final String SENTENCE_DELIMITERS = "[.?!。？！]";
    /** System property Selenium reads to locate the chromedriver binary. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";
    /** Fallback chromedriver location, used only when the property is not already set. */
    private static final String DEFAULT_CHROME_DRIVER_PATH =
            "D:\\download\\chromedriver_win32_2\\chromedriver.exe";
    /** Number of characters kept on each side of a matched sentence as context. */
    private static final int CONTEXT_RADIUS = 50;
    /** Result offset step between search pages; also used for the progress estimate. */
    private static final int RESULTS_PER_PAGE = 10;
    /** Orders matches by ascending similarity. */
    private static final Comparator<TargetSentence> BY_SIMILARITY =
            Comparator.comparingDouble(TargetSentence::getSimilarity);

    // Currently unused: the threshold is read from Input#getMaxRepetitionRate().
    private Float maxRepetitionRate;
    private WebDriver webDriver = null;
    private Input input;
    private Output output;
    // Created in init() but no task is submitted to it by this class.
    private ExecutorService executorService;
    /** Number of target pages processed so far, used for progress reporting. */
    private final AtomicInteger counter = new AtomicInteger(0);

    /** Creates the instance and starts the headless Chrome driver. */
    public DeDupDefaultImpl() {
        init();
    }

    /** Configures and starts the headless Chrome driver and creates the empty output. */
    private void init() {
        // Respect an externally configured driver location; fall back to the default path.
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }
        ChromeOptions options = new ChromeOptions();
        Map<String, Object> prefs = new HashMap<>();
        prefs.put("profile.managed_default_content_settings.images", 2);
        prefs.put("permissions.default.stylesheet", 2);
        prefs.put("dom.ipc.plugins.enabled.libflashplayer.so", "false");
        options.setExperimentalOption("prefs", prefs);
        options.setHeadless(true);
        webDriver = new ChromeDriver(options);
        webDriver.manage().deleteAllCookies();
        webDriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
        webDriver.manage().timeouts().pageLoadTimeout(30, TimeUnit.SECONDS);
        this.output = new Output();

        executorService = Executors.newCachedThreadPool();
    }

    @Override
    public void setInput(Input input) {
        this.input = input;
    }

    /**
     * Splits the input content into sentences and attaches the input to the output.
     *
     * @throws NullPointerException if no input has been set
     */
    private void pretreatment() {
        Objects.requireNonNull(this.input, "input must be set via setInput() before getOutput()");
        String content = this.input.getContent();
        String[] split = content.split(SENTENCE_DELIMITERS);
        input.setSentences(Lists.newArrayList(split));
        output.setInput(input);
    }

    /**
     * Runs the crawl and builds the result.
     *
     * <p>The overall similarity is the summed length of the best-matching target sentence
     * of every input sentence that had at least one match, divided by the length of the
     * input content (0.0 when the content is empty).
     *
     * @return the output holding the input, the per-sentence matches and the overall similarity
     * @throws NullPointerException if no input has been set
     */
    @Override
    public Output getOutput() {
        pretreatment();
        final Map<String, List<TargetSentence>> sentenceMap = new HashMap<>();
        getPage(this.input.getKey(), this.input.getLevel(), new CrawlFinished() {
            @Override
            public void finished(String url, Document document) {
                collectMatches(url, document, sentenceMap);
            }
        });

        Result result = new Result();
        result.setResultSentence(new ResultSentence(sentenceMap));

        int matchedLength = 0;
        for (List<TargetSentence> candidates : sentenceMap.values()) {
            if (candidates.isEmpty()) {
                continue;
            }
            matchedLength += Collections.max(candidates, BY_SIMILARITY).getSentence().length();
        }
        int contentLength = input.getContent().length();
        // Guard the division so empty content yields 0.0 instead of NaN.
        result.setSimilarity(contentLength == 0 ? 0.0 : (double) matchedLength / contentLength);
        output.setResult(result);
        return output;
    }

    /**
     * Compares every sentence of a crawled page with every input sentence and records the
     * ones whose similarity exceeds the configured threshold, then prints the progress.
     *
     * @param url         URL of the crawled page
     * @param document    parsed page
     * @param sentenceMap accumulator mapping each input sentence to its matches
     */
    private void collectMatches(String url, Document document,
                                Map<String, List<TargetSentence>> sentenceMap) {
        String targetContent = document.select("body").text();
        String[] targetSentences = targetContent.split(SENTENCE_DELIMITERS);

        for (String source : input.getSentences()) {
            List<TargetSentence> matches = new ArrayList<>();
            for (String candidate : targetSentences) {
                double similarity = Similarity.getSimilarity(source, candidate);
                if (similarity > input.getMaxRepetitionRate()) {
                    TargetSentence targetSentence = new TargetSentence();
                    targetSentence.setSimilarity(similarity);
                    targetSentence.setSentence(candidate);
                    targetSentence.setUrl(url);
                    targetSentence.setContext("..." + contextOf(targetContent, candidate) + "...");
                    matches.add(targetSentence);
                }
            }
            sentenceMap.computeIfAbsent(source, k -> new ArrayList<>()).addAll(matches);
        }

        double percent = counter.incrementAndGet() / (input.getLevel() * (double) RESULTS_PER_PAGE);
        System.out.println("计算中..."+(percent*100)+"%");
    }

    /**
     * Returns the first occurrence of {@code sentence} in {@code content} together with up
     * to {@link #CONTEXT_RADIUS} characters on each side, clamped to the content bounds.
     */
    private static String contextOf(String content, String sentence) {
        // sentence is a fragment produced by splitting content, so indexOf cannot return -1.
        int index = content.indexOf(sentence);
        int from = Math.max(0, index - CONTEXT_RADIUS);
        int to = Math.min(content.length(), index + sentence.length() + CONTEXT_RADIUS);
        return content.substring(from, to);
    }

    /**
     * Builds the search URL for a result page.
     *
     * @param key  search keyword; URL-encoded as UTF-8 before being appended
     * @param page zero-based result page index
     */
    private String buildSearchUrl(String key, int page) {
        try {
            String encodedKey = URLEncoder.encode(String.valueOf(key), "UTF-8");
            return SEARCH_URL + "wd=" + encodedKey + "&pn=" + (page * RESULTS_PER_PAGE);
        } catch (UnsupportedEncodingException e) {
            // UTF-8 support is mandatory on every Java platform.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
    }

    /**
     * Crawls {@code page} search result pages for {@code key} and hands every visited
     * target page to {@code crawlFinished}. The web driver is always shut down afterwards,
     * also when the crawl fails or the thread is interrupted.
     *
     * @param key           search keyword
     * @param page          number of search result pages to crawl
     * @param crawlFinished callback invoked with the final URL and parsed document of each hit
     */
    private void getPage(String key, int page, CrawlFinished crawlFinished) {
        if (webDriver == null) {
            throw new RuntimeException("webdriver init failed");
        }

        try {
            for (int pageIndex = 0; pageIndex < page; pageIndex++) {
                crawlResultPage(buildSearchUrl(key, pageIndex), crawlFinished);
                // Short pause between search pages.
                Thread.sleep(500);
            }
        } catch (InterruptedException e) {
            // Stop crawling and keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
        } finally {
            // quit() ends the whole driver session, not just the current window.
            webDriver.quit();
        }
    }

    /**
     * Opens one search result page and visits every hit listed on it.
     *
     * @throws InterruptedException if interrupted while backing off after a failed visit
     */
    private void crawlResultPage(String searchUrl, CrawlFinished crawlFinished)
            throws InterruptedException {
        webDriver.get(searchUrl);
        Document doc = Jsoup.parse(webDriver.getPageSource());
        Elements headings = doc.select("#content_left .c-container h3");
        for (Element heading : headings) {
            String targetUrl = heading.select("a").attr("href");
            if (targetUrl.isEmpty()) {
                // No link to follow; avoid a guaranteed navigation failure.
                continue;
            }
            try {
                webDriver.get(targetUrl);
                String currentUrl = webDriver.getCurrentUrl();
                if (isWhiteUrl(currentUrl)) {
                    continue;
                }
                Document targetDocument = Jsoup.parse(webDriver.getPageSource());
                crawlFinished.finished(currentUrl, targetDocument);
            } catch (Exception e) {
                e.printStackTrace();
                TimeUnit.SECONDS.sleep(3);
                try {
                    webDriver.navigate().back();
                } catch (RuntimeException ex) {
                    // A failed recovery must not abort the remaining hits.
                    ex.printStackTrace();
                }
            }
        }
    }

    /** Returns whether {@code currentUrl} starts with one of the input's white-listed URLs. */
    private boolean isWhiteUrl(String currentUrl) {
        if (this.input.getWhiteUrls() == null) {
            return false;
        }
        for (String whiteUrl : this.input.getWhiteUrls()) {
            if (currentUrl.startsWith(whiteUrl)) {
                return true;
            }
        }
        return false;
    }
}


